import pandas as pd
import numpy as np
import datetime
from matplotlib import pyplot as plt
import pickle
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.tools import FigureFactory as ff
import plotly.graph_objs as graph
#setting
# Make Jupyter echo every expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
df = pd.read_csv('/Turnstile_Usage_Data__2019.csv')
# A description of the variables can be found here: http://web.mta.info/developers/turnstile.html
#simple understanding of dataset
df.info()
#create unique turnstile id for each turnstile (Unit + SCP + Station identifies one device)
df['Turnstile_ID'] = df['Unit'].astype(str) + "_" + df['SCP'].astype(str) + "_" + df['Station'].astype(str)
#clean up column name: the raw file has a trailing space in 'Exits '
df.rename(columns={'Exits ': 'Exits'}, inplace=True)
#trim dataset so that it runs faster; .copy() gives an independent frame so the
#column assignments below cannot trigger SettingWithCopy issues
columns_to_keep = ['Turnstile_ID', 'Station', 'Date', 'Time', 'Entries', 'Exits']
df_new = df[columns_to_keep].copy()
#convert date and time into datetime format
#vectorized pd.to_datetime is far faster than a per-row strptime apply
df_new['Date'] = pd.to_datetime(df_new['Date'], format='%m/%d/%Y')
df_new['Time'] = pd.to_datetime(df_new['Time'], format='%H:%M:%S').dt.time
df_new.head()
#threshold for an implausible interval count (assuming 30 people per minute over 4 hours)
quantile_to_flag = 12150
total_num_entry = len(df_new)
print(total_num_entry)
#row-wise absolute difference of the cumulative counters (abs() handles counters that run backwards)
df_new['entry_diff_abs'] = df_new['Entries'].diff().abs()
df_new['exit_diff_abs'] = df_new['Exits'].diff().abs()
#zero out the first reading of each turnstile: that diff spans two different devices
#NOTE(review): assumes rows are ordered by turnstile then time — confirm upstream sort
same_turnstile = df_new['Turnstile_ID'] == df_new['Turnstile_ID'].shift(periods=1, fill_value=0)
df_new['entry_diff_abs'] = df_new['entry_diff_abs'].where(same_turnstile, 0)
df_new['exit_diff_abs'] = df_new['exit_diff_abs'].where(same_turnstile, 0)
#flag intervals whose counts are abnormally high
df_new['flag_high_entry_diff'] = df_new['entry_diff_abs'] > quantile_to_flag
df_new['flag_high_exit_diff'] = df_new['exit_diff_abs'] > quantile_to_flag
#share of flagged readings, as a percentage of all rows
total_abno = df_new['flag_high_entry_diff'].sum() + df_new['flag_high_exit_diff'].sum()
total_abno / total_num_entry * 100
#abnormal entries are less than 0.1% of total data, so we discard the outliers here
entry_outlier_idx = df_new.index[df_new['flag_high_entry_diff']].tolist()
exit_outlier_idx = df_new.index[df_new['flag_high_exit_diff']].tolist()
rows_to_drop = entry_outlier_idx + exit_outlier_idx
df = df_new.drop(rows_to_drop).rename(columns={'entry_diff_abs': 'Entry_diff', 'exit_diff_abs': 'Exit_diff'})
#net flow = people entering the station plus people exiting it in the same interval
df['Net_Flow'] = df['Entry_diff'] + df['Exit_diff']
columns_to_keep = ['Turnstile_ID', 'Station', 'Date', 'Time', 'Exit_diff', 'Entry_diff', 'Net_Flow']
df_cleaned = df[columns_to_keep].copy()
df_cleaned.head()
#persist the cleaned data, then read it back to confirm the round-trip
df_cleaned.to_pickle('../data/processed/data_final.pkl')
df = pd.read_pickle('../data/processed/data_final.pkl') #test if pickle works
df.head()
#Station flow per year
station_flow_annual = df.groupby(by='Station')['Net_Flow'].sum()
#use .iloc for positional access: `station_flow_annual[0]` on a string-indexed
#Series is deprecated positional indexing and breaks in newer pandas
station_flow_annual.iloc[0]
#rank stations by annual net flow, descending (the duplicate groupby is dropped)
station_flow_annual.sort_values(ascending=False, inplace=True)
print(station_flow_annual.index[:20]) #list of top 20
top_20_crowded_stations = station_flow_annual.index[:20]
top_20_crowded_stations
#retrieve day of the week from Date (1 = Monday … 7 = Sunday)
df['Date'] = pd.to_datetime(df['Date'])
df['Day'] = df['Date'].dt.dayofweek + 1
# extract weekday or weekend from day of week
def weekday_weekend(day):
    """Classify a day-of-week number (1=Mon … 7=Sun) as 'Weekday' or 'Weekend'.

    Values outside both ranges yield None, matching the original contract.
    """
    is_weekday = 1 <= day <= 5
    is_weekend = 6 <= day <= 7
    if is_weekday:
        return 'Weekday'
    if is_weekend:
        return 'Weekend'
    return None
#apply function
#map on the single 'Day' column instead of a row-wise DataFrame.apply:
#identical result, much less per-row overhead
df['Day_type'] = df['Day'].map(weekday_weekend)
df.head(1)
#aggregate net flow by station and day of week, then average to get typical traffic per day
avg_net_flow_by_day_of_week = (
    df.groupby(['Station', 'Day'])['Net_Flow']
    .mean()
    .reset_index(name='Mean_traffic_per_station_by_day_of_week')
    .sort_values('Mean_traffic_per_station_by_day_of_week', ascending=False)
)
avg_net_flow_by_day_of_week.head(5)
#tag each (station, day) row as Weekday or Weekend via the helper above
avg_net_flow_by_day_of_week['Day_type'] = avg_net_flow_by_day_of_week['Day'].map(weekday_weekend)
avg_net_flow_by_day_of_week.head()
#average the per-day means within each day type
avg_net_flow_by_day_type = (
    avg_net_flow_by_day_of_week
    .groupby(['Station', 'Day_type'])['Mean_traffic_per_station_by_day_of_week']
    .mean()
    .reset_index(name='Mean_traffic_per_station_by_day_type')
)
avg_net_flow_by_day_type.head(5)
#check that the output is what we intended, using one station
test = avg_net_flow_by_day_type[avg_net_flow_by_day_type['Station'] == 'YORK ST']
test
#get the ratio of weekday and weekend traffic
#use this ratio as a proxy for which stations are filled with tourists
#hypothesis: a ratio close to 1 marks a 'touristy' station, since its
#average weekday traffic is similar to its weekend traffic
prev_within_station = avg_net_flow_by_day_type.groupby('Station')['Mean_traffic_per_station_by_day_type'].shift(1)
avg_net_flow_by_day_type['Ratio_of_traffic_per_station_by_day_type'] = (
    avg_net_flow_by_day_type['Mean_traffic_per_station_by_day_type'] / prev_within_station
)
avg_net_flow_by_day_type.head()
#distribution of the ratio across stations
net_flow_ratio_histogram_distribution = px.histogram(
    avg_net_flow_by_day_type,
    x="Ratio_of_traffic_per_station_by_day_type",
    nbins=50,
    histnorm='percent',
)
net_flow_ratio_histogram_distribution.show()
avg_net_flow_by_day_type['Ratio_of_traffic_per_station_by_day_type'].describe()
#Most of the `ratio` falls below 2, with one outlier near 8 — restrict the range to below 2
ratio_col = 'Ratio_of_traffic_per_station_by_day_type'
below_two = avg_net_flow_by_day_type[ratio_col] < 2
avg_net_flow_by_day_type_remove_outlier = avg_net_flow_by_day_type.loc[below_two]
avg_net_flow_by_day_type_remove_outlier.head(3)
ratio_histogram_distribution = px.histogram(avg_net_flow_by_day_type_remove_outlier, x=ratio_col, nbins=50, histnorm='percent')
ratio_histogram_distribution.show()
#stations with a ratio between 0.9 and 1.1 (inclusive) are considered 'touristy'
touristy_mask = avg_net_flow_by_day_type_remove_outlier[ratio_col].between(0.9, 1.1)
toursity_stations = avg_net_flow_by_day_type_remove_outlier.loc[touristy_mask, 'Station'].unique()
toursity_stations
len(toursity_stations)
#these are the 20 stations with the most traffic
top_20_crowded_stations
toursity_stations_set = set(toursity_stations)
top_20_crowded_stations_set = set(top_20_crowded_stations)
toursity_stations_set.intersection(top_20_crowded_stations_set)
# there are no touristy stations among the top 20 crowded stations
df_w_most_locals = df.loc[df['Station'].isin(top_20_crowded_stations)]
df_w_most_locals_agg_total_traffic = df_w_most_locals.groupby(['Station', 'Day'])['Net_Flow'].sum().reset_index(name='Total_traffic')
df_w_most_locals_agg_total_traffic.head(5)
df_w_most_locals_agg_total_traffic_copy = df_w_most_locals_agg_total_traffic.copy()
#assign the result back instead of Series.replace(..., inplace=True): inplace on a
#selected column is chained assignment, which is deprecated and unreliable under
#pandas copy-on-write
day_names = {1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'}
df_w_most_locals_agg_total_traffic_copy['Day'] = df_w_most_locals_agg_total_traffic_copy['Day'].replace(day_names)
#top stations by day of the week
line_fig = px.line(df_w_most_locals_agg_total_traffic_copy, x="Day", y="Total_traffic", color='Station', title='Total traffic per day of the week')
line_fig.show()
df_w_most_locals_agg_total_traffic_copy['Station'].unique()
# '34 ST-PENN STA', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '23 ST' and 'TIMES SQ-42 ST' are the top 5 stations
top_5_stations = ['34 ST-PENN STA', 'GRD CNTRL-42 ST', '34 ST-HERALD SQ', '23 ST', 'TIMES SQ-42 ST']
#.copy() so that later column assignments act on an independent frame, not a view of df
df_5_stations_w_most_locals = df.loc[df['Station'].isin(top_5_stations)].copy()
df_5_stations_w_most_locals_agg_total_traffic = df_5_stations_w_most_locals.groupby(['Station', 'Day'])['Net_Flow'].sum().reset_index(name='Total_traffic')
df_5_stations_w_most_locals_agg_total_traffic_sorted = df_5_stations_w_most_locals_agg_total_traffic.sort_values(by=['Day', 'Total_traffic'], ascending=[True, False])
df_5_stations_w_most_locals_agg_total_traffic_sorted.head(10)
df_5_stations_w_most_locals_agg_total_traffic_copy = df_5_stations_w_most_locals_agg_total_traffic_sorted.copy()
#assign back rather than Series.replace(..., inplace=True): inplace on a selected
#column is chained assignment (deprecated; silently ineffective under copy-on-write)
df_5_stations_w_most_locals_agg_total_traffic_copy['Day'] = df_5_stations_w_most_locals_agg_total_traffic_copy['Day'].replace({1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'})
top_5_line_fig = px.line(df_5_stations_w_most_locals_agg_total_traffic_copy, x="Day", y="Total_traffic", color='Station', title='Total traffic per day of the week')
top_5_line_fig.show()
# retrieve the hour from the `Time` variable
# `Time` holds datetime.time objects (set during cleaning); pd.to_datetime cannot
# parse time objects directly, so stringify first — the original call would fail
df_5_stations_w_most_locals['Time'] = pd.to_datetime(df_5_stations_w_most_locals['Time'].astype(str), format='%H:%M:%S')
df_5_stations_w_most_locals['Hour'] = df_5_stations_w_most_locals['Time'].dt.hour
df_5_stations_w_most_locals.head(1)
#keep only the readings stamped 07:00 through 23:00
df_5_stations_w_most_locals_selected_hours = df_5_stations_w_most_locals.loc[df_5_stations_w_most_locals['Hour'].isin(np.arange(7, 24))]
sorted(df_5_stations_w_most_locals_selected_hours['Hour'].unique())
# aggregate the timing into different time blocks
# `Time` aggregates entries and exits on a 4-hourly basis:
# the timestamp 09:00 refers to data collected since 05:00
def time_block(hour):
    """Map an hour-of-day stamp (0-23) to its time-of-day bucket.

    Returns None for hours outside 9-23 (e.g. the 07:00/08:00 stamps).
    """
    # bug fix: original condition `9 <= hour == 12` matched only hour 12,
    # silently dropping hours 9-11 from the Morning bucket
    if 9 <= hour <= 12:    #in reality: 5am to 9am; morning
        return 'Morning'
    if 13 <= hour <= 18:   #in reality: 9am to 2pm; afternoon
        return 'Afternoon'
    if 19 <= hour <= 21:   #in reality: 3pm to 5pm; late afternoon
        return 'Late afternoon'
    if 22 <= hour <= 23:   #in reality: 6pm to 10pm; evening (bound tightened from 26: hour < 24)
        return 'Evening'
    return None
# check that the time_block assignment is as intended
df_5_stations_w_most_locals_selected_hours['Time_of_day'] = df_5_stations_w_most_locals_selected_hours['Hour'].map(time_block)
df_5_stations_w_most_locals_selected_hours.head(1)
len(df_5_stations_w_most_locals_selected_hours)
#weekdays carry more traffic, so break weekday traffic down into the time blocks
weekday_mask = df_5_stations_w_most_locals_selected_hours['Day_type'] == 'Weekday'
df_5_stations_w_most_locals_selected_hours_weekday = df_5_stations_w_most_locals_selected_hours.loc[weekday_mask]
df_5_stations_w_most_locals_selected_hours_weekday_agg = (
    df_5_stations_w_most_locals_selected_hours_weekday
    .groupby(['Station', 'Time_of_day'])['Net_Flow']
    .sum()
    .reset_index(name='Total_traffic')
    .sort_values(by=['Time_of_day', 'Total_traffic'], ascending=[True, False])
)
df_5_stations_w_most_locals_selected_hours_weekday_agg
# hypothesis: if a station sees more traffic in a certain time block than other
# stations or other time blocks, street-team allocation can be optimised accordingly
init_notebook_mode()
heat_source = df_5_stations_w_most_locals_selected_hours_weekday_agg
heatmap_trace = graph.Heatmap(
    z=heat_source.Total_traffic,
    y=heat_source.Station,
    x=heat_source.Time_of_day,
)
#categorical axes: stations sorted alphabetically, time blocks in chronological order
heatmap_layout = graph.Layout(
    yaxis=dict(type='category', categoryorder='category ascending', showgrid=False),
    xaxis=dict(
        type='category',
        categoryorder='array',
        categoryarray=['Morning', 'Afternoon', 'Late afternoon', 'Evening'],
        showgrid=False,
    ),
)
fig = graph.Figure(data=[heatmap_trace], layout=heatmap_layout)
iplot(fig)
#across the 5 stations, traffic builds steadily from morning to late afternoon: a smaller
#team could start the morning, growing through the afternoon and late afternoon
#Grand Central-42 St is the most crowded in the late afternoon — teams could be moved
#there from less crowded stations during that period
#by evening, traffic drops sharply versus morning/late afternoon; worth investigating
#whether other stations are busier in the evening and redeploying teams accordingly
total_traffic_by_day_type = df.groupby(['Station', 'Day_type'])['Net_Flow'].sum().reset_index(name='Total_traffic_by_day_type')
is_weekend = total_traffic_by_day_type['Day_type'] == 'Weekend'
total_traffic_weekend = total_traffic_by_day_type.loc[is_weekend]
total_traffic_weekend_sorted = total_traffic_weekend.sort_values(by='Total_traffic_by_day_type', ascending=False)
total_traffic_weekend_sorted.head(20)
#the stations most crowded on weekends largely coincide with the weekday leaders,
#so street teams can cover the same stations on both weekdays and weekends
highest_traffic_station_weekend = list(total_traffic_weekend_sorted['Station'].head(20))
highest_traffic_station_weekend_set = set(highest_traffic_station_weekend)
common = toursity_stations_set & highest_traffic_station_weekend_set
common
toursity_stations_set
highest_traffic_station_weekend
#no touristy stations fall in the top 20 crowded weekend stations
top_20_crowded_stations_set & highest_traffic_station_weekend_set
len(top_20_crowded_stations_set & highest_traffic_station_weekend_set)
#most of the top 20 crowded stations also lead on weekend traffic, reinforcing the
#recommendation to deploy street teams to them on both weekdays and weekends
# This is a preliminary EDA of traffic flow using MTA turnstile data. Street-team allocation could be
# further optimized by enriching the data with additional sources, e.g. event calendars, weather, or
# neighborhood demographics.